In [1]:

    
from collections import OrderedDict
import datetime

import numpy as np
import pandas as pd

from IPython.display import HTML

import statsmodels.api as sm

from break4w.categorical import Categorical
from break4w.continous import Continous
from break4w.question import Question
from break4w.bool import Bool
from break4w.data_dictionary import DataDictionary

I'm going to try to make a data dictionary object using columns from an example data dictionary and a study I worked with a while ago. I'm going to start by assuming we can convert a text document into a series of dictionaries to build from. I'm going to use the data description from the Statsmodels National Election dataset (`anes96`).



In [2]:

    
# Load the statsmodels `anes96` example dataset as a DataFrame.
# `load_pandas()` is the loader documented to return pandas objects, so the
# result does not depend on what `load()` returns in the installed release
# and needs no manual DataFrame wrapping.
data_ = sm.datasets.anes96.load_pandas().data



In [3]:

    
# Column definitions: each dict holds the keyword arguments used to build one
# question object (it is unpacked with ** when the objects are created).
# NOTE(review): `columns` has four entries but `types` has only three, so the
# cells that pair the two lists with zip never reach `vote` -- consistent with
# the three-column outputs recorded in this notebook. Presumably a 'bool' label
# is missing from `types`; confirm that Bool accepts these arguments first.
columns = [
    {
        'name': 'popul',
        'description': 'Census place population in 1000s',
        'dtype': float,
        'units': 'people',
        'magnitude': 1000,
    },
    {
        'name': 'TVnews',
        'description': 'Number of times per week that respondent watches TV news.',
        'dtype': int,
        'units': 'views per week',
        'clean_name': 'TV news',
        'limits': [0, None]
    },
    {
        'name': 'PID',
        'description': 'Party identification of respondent',
        'dtype': int,
        'order': [0, 1, 2, 3, 4, 5, 6],
        # Code 3 previously read 'Independent-Indpendent' (misspelled label).
        'numeric_mapping': {0: 'Strong Democrat',
                            1: 'Weak Democrat',
                            2: 'Independent-Democrat',
                            3: 'Independent-Independent',
                            4: 'Independent-Republican',
                            5: 'Weak Republican',
                            6: 'Strong Republican'}
    },
    {
        'name': 'vote',
        'description': 'Individual expected to vote for Bob Dole',
        'dtype': bool,
    },
]
# Type label for each column definition, matched to `columns` by position.
types = ['continous', 'question', 'categorical']



In [4]:

    
# Map a lower-case type label to the break4w class used to build the question
# object. Several labels are aliases for the same class.
type_lookup = {'continous': Continous}
type_lookup.update(
    dict.fromkeys(['categorical', 'multiple choice', 'ordinal'], Categorical)
)
type_lookup.update(
    dict.fromkeys(['bool', 'boolean', 'yes/no'], Bool)
)



In [5]:

    
# Build one question object per (column definition, type label) pair, keyed by
# the column's name and kept in definition order.
# Labels that are not in `type_lookup` fall back to the generic Question class.
# zip stops at the shorter of the two lists, so a column definition without a
# matching type label is silently left out.
proto_dict = OrderedDict()
for col_, type_ in zip(columns, types):
    question_type = type_lookup.get(type_.lower(), Question)
    proto_dict[col_['name']] = question_type(**col_)



In [6]:

    
# Inspect the serialised form of the `popul` question; the recorded output is a
# (type label, properties dict) tuple.
proto_dict['popul'].to_dict()









    Out[6]:





('continous',
 {'name': 'popul',
  'description': 'Census place population in 1000s',
  'dtype': float,
  'clean_name': 'Popul',
  'units': 'people'})



In [7]:

    
# Same label -> class mapping as the `type_lookup` assigned in an earlier cell;
# this cell rebinds the name to an equal dict.
type_lookup = dict([
    ('continous', Continous),
    ('categorical', Categorical),
    ('multiple choice', Categorical),
    ('ordinal', Categorical),
    ('bool', Bool),
    ('boolean', Bool),
    ('yes/no', Bool),
])



In [8]:

    
# Show the assembled mapping of column name -> question object.
proto_dict









    Out[8]:





OrderedDict([('popul', <break4w.continous.Continous at 0x11a4376a0>),
             ('TVnews', <break4w.question.Question at 0x11a437710>),
             ('PID', <break4w.categorical.Categorical at 0x11a437748>)])



In [ ]:



In [9]:

    
# Build a DataDictionary directly from the column definitions and type labels.
dict_ = DataDictionary(columns, types)



In [10]:

    
# Print the dictionary's text summary (column count, then one line per column
# with its question class, per the recorded output).
print(dict_)









    



Data Dictionary with 3 columns
-----------------------------------------------------------------------------
popul (Continous)
TVnews (Question)
PID (Categorical)
-----------------------------------------------------------------------------



In [11]:

    
# Tabulate the data dictionary; the recorded output shows one row per column,
# indexed by `name`.
df_ = dict_.to_dataframe()



In [12]:

    
# Display the tabulated data dictionary.
df_









    Out[12]:







  
    
      
      description
      dtype
      type
      clean_name
      units
      limits
      numeric_mapping
      order
    
    
      name
      
      
      
      
      
      
      
      
    
  
  
    
      popul
      Census place population in 1000s
      float
      Continous
      Popul
      people
      NaN
      NaN
      NaN
    
    
      TVnews
      Number of times per week that respondent watch...
      int
      Question
      TV news
      views per week
      0 | None
      NaN
      NaN
    
    
      PID
      Party identification of respondent
      int
      Categorical
      Pid
      NaN
      NaN
      {0: 'Strong Democrat', 1: 'Weak Democrat', 2: ...
      0 | 1 | 2 | 3 | 4 | 5 | 6



In [ ]:



In [ ]:

    
# Display the tabulated data dictionary again (same expression as an earlier cell).
df_



In [ ]:

    
# NOTE(review): `test` is never assigned in the visible part of this notebook --
# presumably a DataDictionary created in a cell that has since been removed.
# It must be defined before this cell can run on a fresh kernel. TODO confirm.
# Exercises add_question with a (definition, type label) pair and with an
# already-built question object. The second and third calls both use
# `columns[1]`, presumably to see how a repeated name is handled -- confirm.
test.add_question(columns[0], types[0])
test.add_question(Continous(**columns[1]))
test.add_question(columns[1], types[1])



In [ ]:

    
# List the keys of the mapping stored on `test.columns`.
list(test.columns.keys())



In [ ]:

    
columns = test.columns



In [ ]:

    
columns



In [ ]:

    
del columns['popul']



In [ ]:

    
columns



In [ ]:

    
# Re-inspect the mapping held by `test` after the deletion made in an earlier cell.
test.columns



In [ ]:

    
# Inspect the `log` attribute of `test` (presumably a record of the changes
# made so far -- confirm against the DataDictionary implementation).
test.log



In [ ]:

    
# Pass the entry at position 2 of `columns` and of `types` to add_question.
test.add_question(columns[2], types[2])



In [ ]:

    
# Capture the attribute dictionary of the `popul` question. vars() returns the
# object's own `__dict__` rather than a copy, so (assuming a regular instance)
# later attribute changes are visible through `current` as well.
current = vars(test['popul'])



In [ ]:

    
# Attribute updates to apply to the question: attribute name -> new value.
new = dict(
    blanks='not applicable',
    frog='Chowder',
)



In [ ]:

    
# For every key in `new`, record (previous value, new value) -- or
# ('add', new value) when the key is not in `current` -- and then set the
# attribute on the `popul` question.
change_keys = {}
for attr, value in new.items():
    # Look up the previous value before setattr runs, so it is not overwritten.
    previous = current[attr] if attr in current else 'add'
    change_keys[attr] = (previous, value)
    setattr(test['popul'], attr, value)



In [ ]:

    
# Read back the `frog` attribute that the update loop set on the `popul` question.
test['popul'].frog



In [ ]:

    
# Keep a local reference to whatever `test['popul']` returns.
check = test['popul']



In [ ]:

    
# Set a new attribute through the local reference; the value is the string
# 'None', not the None object.
check.cat = 'None'



In [ ]:

    
# Read the attribute back from the local reference.
check.cat



In [ ]:

    
# Read the same attribute through `test` -- presumably to confirm that
# `test['popul']` returns the stored object rather than a copy; confirm.
test['popul'].cat



In [ ]:

    
# Show every attribute currently set on the question object.
vars(check)



In [ ]:

	description	dtype	type	clean_name	units	limits	numeric_mapping	order
name
popul	Census place population in 1000s	float	Continous	Popul	people	NaN	NaN	NaN
TVnews	Number of times per week that respondent watch...	int	Question	TV news	views per week	0 \| None	NaN	NaN
PID	Party identification of respondent	int	Categorical	Pid	NaN	NaN	{0: 'Strong Democrat', 1: 'Weak Democrat', 2: ...	0 \| 1 \| 2 \| 3 \| 4 \| 5 \| 6